---
title: Supervised Machine Learning
subtitle: An Introduction using Decision Tree Learning
---

# Setup and Libraries

```{r}
#| eval: true
#| echo: true
#| output: false
library(tidyverse)
library(arrow)
library(rpart)
library(rpart.plot)
library(precrec)
library(caret)
library(kableExtra)
source("src/parttree-master/R/parttree.R")
source("src/parttree-master/R/geom_parttree.R")
```

# Synthetic Unemployment Data

```{r}
#| eval: true
#| echo: true
#| output: true
# Load the synthetic unemployment records from the Parquet file.
synthetic_unemployment_data <- read_parquet(
  "data/synthetic_unemployment_data.parquet"
)

# Fix the RNG state so the random partition below is reproducible.
set.seed(123)

# Tag every row independently as "train" (prob. 0.75) or "test" (prob. 0.25).
# Sampling is with replacement, so the realised split is only roughly 75/25.
data <- mutate(
  synthetic_unemployment_data,
  train_index = sample(
    c("train", "test"),
    nrow(synthetic_unemployment_data),
    replace = TRUE,
    prob = c(0.75, 0.25)
  )
)

# Separate the two partitions according to the tag.
train <- filter(data, train_index == "train")
test <- filter(data, train_index == "test")
```

# Fitting Trees with RPART

```{r}
#| eval: true
#| echo: true
#| output: true
# Grow a decision tree for target_low on the training partition using three
# predictors. The columns train_index and target_high are removed from the
# data handed to rpart; cp is rpart's complexity parameter.
tree <- rpart(
  target_low ~ days_unemployment_2j + age + days_to_last_job,
  data = select(train, -c(train_index, target_high)),
  cp = 0.007
)

# Print the fitted tree as a text listing of its nodes.
tree
```

```{r}
#| eval: true
#| echo: true
#| output: true
#| fig-height: 10
# Draw the fitted tree with a red-blue box palette and without node numbers.
rpart.plot(
  tree,
  type = 1,
  nn = FALSE,
  box.palette = "RdBu"
)
```

```{r}
#| eval: true
#| echo: true
#| output: true
# Print the text representation of the fitted tree once more, next to the plot.
tree
```

# Confusion Matrix in R

```{r}
#| eval: true
#| echo: true
#| output: true
# Predict a class label for every test row with the fitted tree.
test[["prediction_tree"]] <- predict(tree, newdata = test, type = "class")

# Cross-tabulate predictions against the observed target_low values, treating
# "successful" as the positive class and reporting sensitivity/specificity.
confusion <- confusionMatrix(
  data = test[["prediction_tree"]],
  reference = test[["target_low"]],
  positive = "successful",
  mode = "sens_spec"
)
```

```{r}
#| eval: true
#| echo: true
#| output: true
# Print the confusion matrix and the statistics caret derived from it.
confusion
```

# Cut-Off in R

```{r}
#| eval: true
#| echo: true
#| output: true
# Predicted probability of the positive class ("successful") for each test row.
# The column is selected by name rather than by position so the score does not
# silently flip to the other class if the factor levels are ordered differently.
test$score_tree <- predict(
  tree,
  newdata = test,
  type = "prob"
)[, "successful"]

# Classify with a custom cut-off of 0.3 instead of the default majority vote.
# The factor is built with the reference's levels so that both classes are
# always present, even if every score falls on one side of the cut-off
# (as.factor() would drop the unused level and the levels would no longer
# match those of target_low).
test <- test |>
  mutate(
    prediction_tree = factor(
      ifelse(score_tree > 0.3, "successful", "unsuccessful"),
      levels = levels(target_low)
    )
  )

# Re-evaluate the predictions obtained with the custom cut-off.
confusion <- confusionMatrix(
  data = test$prediction_tree,
  reference = test$target_low,
  positive = "successful",
  mode = "sens_spec"
)

confusion
```

# ROC- and PR-Curve in R

```{r}
#| eval: true
#| echo: true
#| output: true
# Score of the tree: second column of the class-probability matrix.
# NOTE(review): presumably this is the "unsuccessful" probability (the caret
# scores compared against it later are taken from `$unsuccessful`) - confirm.
test[["prediction_tree_scores"]] <- predict(
  tree,
  newdata = test,
  type = "prob"
)[, 2]

# Baseline: uniformly distributed random scores, one per test row.
test[["prediction_random"]] <- runif(n = nrow(test))

# Evaluate both score vectors against the same labels; scores and labels are
# passed as two-column matrices, one column per model.
precrec_obj <- evalmod(
  scores = cbind(
    test[["prediction_tree_scores"]],
    test[["prediction_random"]]
  ),
  labels = cbind(test[["target_low"]], test[["target_low"]]),
  modnames = c("classification tree", "random"),
  ties_method = "first"
)
```

```{r}
#| eval: true
#| echo: true
#| output: true
#| fig-height: 3.7
# Plot the curves held in precrec_obj; `&` applies the colour scale and the
# theme to every panel of the combined plot.
autoplot(precrec_obj) &
  scale_colour_manual(values = c("#9C6B91", "#336699", "#e52320")) &
  theme(
    text = element_text(family = "Century Gothic", size = 14),
    panel.background = element_rect(colour = NA, fill = "transparent"),
    plot.background = element_rect(colour = NA, fill = "transparent"),
    legend.background = element_rect(colour = NA, fill = "transparent")
  )
# Left disabled by the author:
# precrec::auc(precrec_obj)
```

# Train Control and Tuning Grid

```{r}
#| eval: true
#| echo: true
#| output: true
# Resampling set-up for caret: 10-fold cross-validation repeated 10 times.
# Hold-out predictions are stored and class probabilities are computed, which
# twoClassSummary needs in order to report ROC, sensitivity and specificity.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  savePredictions = TRUE,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
```

```{r}
#| eval: true
#| echo: true
#| output: true
# Candidate values of the complexity parameter cp, one row per candidate.
tuning_grid <- expand.grid(
  cp = c(0.0005, 0.001, 0.005, 0.05)
)

# Show the grid.
tuning_grid
```

# Training the Model

```{r}
#| eval: true
#| echo: true
#| output: true
# Tune an rpart tree with caret: every cp value in tuning_grid is evaluated
# with the resampling scheme in `control`, and the candidate with the best
# "ROC" metric is selected. Rows with missing values are passed through
# (na.pass) rather than dropped.
tree_caret <- train(
  target_low ~ days_unemployment_2j + age + days_to_last_job,
  data = select(train, -c(train_index, target_high)),
  method = "rpart",
  metric = "ROC",
  tuneGrid = tuning_grid,
  trControl = control,
  na.action = na.pass
)
```

```{r}
#| eval: true
#| echo: true
#| output: true
# Print the caret training summary for the tuned tree.
tree_caret
```

# Extracting the Model

```{r}
#| eval: true
#| echo: true
#| output: true
#| fig-height: 4
# Pull the final rpart model out of the caret result. This replaces the
# earlier `tree` object.
tree <- tree_caret[["finalModel"]]

# Plot the tuned tree with the same palette as before, without node numbers.
rpart.plot(
  tree,
  type = 2,
  nn = FALSE,
  box.palette = "RdBu"
)
```

# Predicting in Test Data

```{r}
#| eval: true
#| echo: true
#| output: true
# Predict class labels for the test partition with the tuned caret model.
# Rows with missing values are passed through rather than dropped so that the
# result has one prediction per test row.
test$prediction_caret <- predict(
  tree_caret,
  newdata = test,
  type = "raw",
  na.action = na.pass
)

# Confusion matrix for the tuned model. The arguments are named explicitly:
# `data` must be the predictions and `reference` the observed classes.
# Passing them positionally in the opposite order transposes the table and
# mislabels sensitivity/specificity.
confusion <- confusionMatrix(
  data = test$prediction_caret,
  reference = test$target_low,
  positive = "successful",
  mode = "sens_spec"
)
```

```{r}
#| eval: true
#| echo: true
#| output: true
# Print the confusion matrix and statistics for the caret predictions.
confusion
```

# Model Comparison

```{r}
#| eval: true
#| echo: true
#| output: true
# Score of the tuned model: predicted probability of the "unsuccessful" class.
test[["prediction_caret_scores"]] <- predict(
  tree_caret,
  newdata = test,
  type = "prob",
  na.action = na.pass
)[["unsuccessful"]]

# Compare the scores of the first tree (computed earlier and stored in
# prediction_tree_scores) with those of the tuned tree on the same labels.
precrec_obj <- evalmod(
  scores = cbind(
    test[["prediction_tree_scores"]],
    test[["prediction_caret_scores"]]
  ),
  labels = cbind(test[["target_low"]], test[["target_low"]]),
  modnames = c("classification tree", "classification tree (optimized)"),
  ties_method = "first"
)
```

```{r}
#| eval: true
#| echo: true
#| output: true
#| fig-height: 3.7
# Plot the curves for both trees; `&` applies the colour scale and the theme
# to every panel of the combined plot.
autoplot(precrec_obj) &
  scale_colour_manual(values = c("#9C6B91", "#336699", "#e52320")) &
  theme(
    text = element_text(family = "Century Gothic", size = 14),
    panel.background = element_rect(colour = NA, fill = "transparent"),
    plot.background = element_rect(colour = NA, fill = "transparent"),
    legend.background = element_rect(colour = NA, fill = "transparent")
  )
```
